{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f33f87f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "%run -i \"../util/util_simple_classifier.ipynb\"\n",
    "%run -i \"../util/lang_utils.ipynb\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "13b018b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk import word_tokenize\n",
    "from sklearn.cluster import KMeans\n",
    "from nltk.probability import FreqDist\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.model_selection import StratifiedShuffleSplit\n",
    "from joblib import dump, load"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ddfe5bb7",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Repo card metadata block was not found. Setting CardData to empty.\n",
      "Repo card metadata block was not found. Setting CardData to empty.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                   text  label     label_text\n",
      "0     wales want rugby league training wales could f...      2          sport\n",
      "1     china aviation seeks rescue deal scandal-hit j...      1       business\n",
      "2     rock band u2 break ticket record u2 have smash...      3  entertainment\n",
      "3     markets signal brazilian recovery the brazilia...      1       business\n",
      "4     tough rules for ringtone sellers firms that fl...      0           tech\n",
      "...                                                 ...    ...            ...\n",
      "1220  us economy shows solid gdp growth the us econo...      1       business\n",
      "1221  microsoft releases bumper patches microsoft ha...      0           tech\n",
      "1222  stuart joins norwich from addicks norwich have...      2          sport\n",
      "1223  why few targets are better than many the econo...      1       business\n",
      "1224  boothroyd calls for lords speaker betty boothr...      4       politics\n",
      "\n",
      "[1225 rows x 3 columns]\n",
      "                                                  text  label     label_text\n",
      "0    carry on star patsy rowlands dies actress pats...      3  entertainment\n",
      "1    sydney to host north v south game sydney will ...      2          sport\n",
      "2    uk coal plunges into deeper loss shares in uk ...      1       business\n",
      "3    blair joins school sailing trip the prime mini...      4       politics\n",
      "4    bath faced with tindall ultimatum mike tindall...      2          sport\n",
      "..                                                 ...    ...            ...\n",
      "995  mobile multimedia slow to catch on there is no...      0           tech\n",
      "996  owen determined to stay in madrid england forw...      2          sport\n",
      "997  mobile tv tipped as one to watch scandinavians...      0           tech\n",
      "998  stormy year for property insurers a string of ...      1       business\n",
      "999  what the election should really be about  a ge...      4       politics\n",
      "\n",
      "[1000 rows x 3 columns]\n"
     ]
    }
   ],
   "source": [
    "# Load dataset\n",
    "train_dataset = load_dataset(\"SetFit/bbc-news\", split=\"train\")\n",
    "test_dataset = load_dataset(\"SetFit/bbc-news\", split=\"test\")\n",
    "train_df = train_dataset.to_pandas()\n",
    "test_df = test_dataset.to_pandas()\n",
    "print(train_df)\n",
    "print(test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "53a7fd0d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "               text  label\n",
      "label_text                \n",
      "business        286    286\n",
      "entertainment   210    210\n",
      "politics        242    242\n",
      "sport           275    275\n",
      "tech            212    212\n",
      "               text  label\n",
      "label_text                \n",
      "business        224    224\n",
      "entertainment   176    176\n",
      "politics        175    175\n",
      "sport           236    236\n",
      "tech            189    189\n"
     ]
    }
   ],
   "source": [
    "# See the distribution of classes\n",
    "print(train_df.groupby('label_text').count())\n",
    "print(test_df.groupby('label_text').count())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "173dea18",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "               text  label\n",
      "label_text                \n",
      "business        408    408\n",
      "entertainment   309    309\n",
      "politics        333    333\n",
      "sport           409    409\n",
      "tech            321    321\n",
      "               text  label\n",
      "label_text                \n",
      "business        102    102\n",
      "entertainment    77     77\n",
      "politics         84     84\n",
      "sport           102    102\n",
      "tech             80     80\n"
     ]
    }
   ],
   "source": [
    "# Combine train and test dataframes and create a better train/test split\n",
    "combined_df = pd.concat([train_df, test_df], ignore_index=True, sort=False)\n",
    "sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)\n",
    "train_index, test_index = next(sss.split(combined_df[\"text\"], combined_df[\"label\"]))\n",
    "train_df = combined_df[combined_df.index.isin(train_index)].copy()\n",
    "test_df = combined_df[combined_df.index.isin(test_index)].copy()\n",
    "print(train_df.groupby('label_text').count())\n",
    "print(test_df.groupby('label_text').count())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "fbe407e6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                   text  label     label_text  \\\n",
      "0     wales want rugby league training wales could f...      2          sport   \n",
      "1     china aviation seeks rescue deal scandal-hit j...      1       business   \n",
      "2     rock band u2 break ticket record u2 have smash...      3  entertainment   \n",
      "3     markets signal brazilian recovery the brazilia...      1       business   \n",
      "4     tough rules for ringtone sellers firms that fl...      0           tech   \n",
      "...                                                 ...    ...            ...   \n",
      "2217  soros group warns of kazakh close the open soc...      1       business   \n",
      "2218  election  could be terror target  terrorists m...      4       politics   \n",
      "2219  lifestyle  governs mobile choice  faster  bett...      0           tech   \n",
      "2220  mobile multimedia slow to catch on there is no...      0           tech   \n",
      "2221  owen determined to stay in madrid england forw...      2          sport   \n",
      "\n",
      "                                         text_tokenized  \n",
      "0     [wales, want, rugby, league, training, wales, ...  \n",
      "1     [china, aviation, seeks, rescue, deal, scandal...  \n",
      "2     [rock, band, u2, break, ticket, record, u2, sm...  \n",
      "3     [markets, signal, brazilian, recovery, brazili...  \n",
      "4     [tough, rules, ringtone, sellers, firms, flout...  \n",
      "...                                                 ...  \n",
      "2217  [soros, group, warns, kazakh, close, open, soc...  \n",
      "2218  [election, could, terror, target, terrorists, ...  \n",
      "2219  [lifestyle, governs, mobile, choice, faster, b...  \n",
      "2220  [mobile, multimedia, slow, catch, doubt, mobil...  \n",
      "2221  [owen, determined, stay, madrid, england, forw...  \n",
      "\n",
      "[1780 rows x 4 columns]\n",
      "                                                   text  label     label_text  \\\n",
      "7     dirty den s demise seen by 14m more than 14 mi...      3  entertainment   \n",
      "8     lib dems  new election pr chief the lib dems h...      4       politics   \n",
      "16    pearce keen on succeeding keegan joint assista...      2          sport   \n",
      "18    ask jeeves tips online ad revival ask jeeves h...      1       business   \n",
      "20    ministers deny care sums  wrong  ministers hav...      4       politics   \n",
      "...                                                 ...    ...            ...   \n",
      "2207  labour accused of  eu propaganda  a  taxpayer ...      4       politics   \n",
      "2209  gronkjaer agrees switch to madrid jesper gronk...      2          sport   \n",
      "2222  mobile tv tipped as one to watch scandinavians...      0           tech   \n",
      "2223  stormy year for property insurers a string of ...      1       business   \n",
      "2224  what the election should really be about  a ge...      4       politics   \n",
      "\n",
      "                                         text_tokenized  \n",
      "7     [dirty, den, demise, seen, 14m, 14, million, p...  \n",
      "8     [lib, dems, new, election, pr, chief, lib, dem...  \n",
      "16    [pearce, keen, succeeding, keegan, joint, assi...  \n",
      "18    [ask, jeeves, tips, online, ad, revival, ask, ...  \n",
      "20    [ministers, deny, care, sums, wrong, ministers...  \n",
      "...                                                 ...  \n",
      "2207  [labour, accused, eu, propaganda, taxpayer, su...  \n",
      "2209  [gronkjaer, agrees, switch, madrid, jesper, gr...  \n",
      "2222  [mobile, tv, tipped, one, watch, scandinavians...  \n",
      "2223  [stormy, year, property, insurers, string, sto...  \n",
      "2224  [election, really, general, election, best, ch...  \n",
      "\n",
      "[445 rows x 4 columns]\n"
     ]
    }
   ],
   "source": [
    "# Preprocess the data\n",
    "train_df = tokenize(train_df, \"text\")\n",
    "train_df = remove_stopword_punct(train_df, \"text_tokenized\")\n",
    "test_df = tokenize(test_df, \"text\")\n",
    "test_df = remove_stopword_punct(test_df, \"text_tokenized\")\n",
    "print(train_df)\n",
    "print(test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "8f83c5a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the training data and create the vectorizer\n",
    "train_df[\"text_clean\"] = train_df[\"text_tokenized\"].apply(lambda x: \" \".join(list(x)))\n",
    "test_df[\"text_clean\"] = test_df[\"text_tokenized\"].apply(lambda x: \" \".join(list(x)))\n",
    "train_df.to_json(\"../data/bbc_train.json\")\n",
    "test_df.to_json(\"../data/bbc_test.json\")\n",
    "vec = TfidfVectorizer(ngram_range=(1,3))\n",
    "matrix = vec.fit_transform(train_df[\"text_clean\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "c765de13",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>#sk-container-id-1 {color: black;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>KMeans(n_clusters=5, n_init=10)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">KMeans</label><div class=\"sk-toggleable__content\"><pre>KMeans(n_clusters=5, n_init=10)</pre></div></div></div></div></div>"
      ],
      "text/plain": [
       "KMeans(n_clusters=5, n_init=10)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Cluster the data\n",
    "km = KMeans(n_clusters=5, n_init=10)\n",
    "km.fit(matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "26587ccf",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_most_frequent_words(text, num_words):\n",
    "    word_list = word_tokenize(text)\n",
    "    freq_dist = FreqDist(word_list)\n",
    "    top_words = freq_dist.most_common(num_words)\n",
    "    top_words = [word[0] for word in top_words]\n",
    "    return top_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "69f80f43",
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_most_common_words_by_cluster(input_df, km, num_clusters):\n",
    "    clusters = km.labels_.tolist()\n",
    "    input_df[\"cluster\"] = clusters\n",
    "    for cluster in range(0, num_clusters):\n",
    "        this_cluster_text = input_df[input_df['cluster'] == cluster]\n",
    "        all_text = \" \".join(this_cluster_text['text_clean'].astype(str))\n",
    "        top_200 = get_most_frequent_words(all_text, 200)\n",
    "        print(cluster)\n",
    "        print(top_200)\n",
    "    return input_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "da49e46f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "['said', 'mr', 'would', 'us', 'year', 'government', 'also', 'new', 'people', 'could', 'last', 'one', 'uk', 'market', 'years', 'growth', 'company', '000', 'economy', 'two', 'told', 'world', 'bank', 'may', 'economic', 'sales', 'time', 'however', 'first', 'minister', 'country', 'european', 'made', 'firm', 'chief', 'law', 'public', 'many', 'deal', '2004', 'expected', 'eu', 'china', 'three', 'since', 'business', 'next', 'prices', 'says', 'work', 'say', 'group', 'make', 'months', 'london', 'added', 'shares', 'plans', 'still', 'party', 'countries', 'executive', 'foreign', 'report', 'take', 'rise', 'lord', 'set', 'bbc', 'companies', 'much', 'news', 'back', 'week', 'oil', 'state', 'financial', 'figures', 'europe', 'firms', 'general', 'british', 'december', 'well', 'secretary', 'police', 'home', 'jobs', 'president', 'part', 'get', 'spending', 'spokesman', 'increase', 'trade', 'rate', 'stock', 'interest', 'blair', 'analysts', 'move', 'budget', 'rights', 'way', 'money', 'dollar', 'decision', 'house', 'labour', 'high', 'month', 'case', 'national', 'biggest', 'saying', 'end', 'office', 'strong', 'court', 'million', 'offer', 'rates', 'put', '2005', 'meeting', 'bill', 'already', 'even', 'legal', 'election', 'think', 'pay', 'want', 'go', 'demand', 'international', 'see', 'bid', 'costs', 'like', 'investment', 'current', 'likely', 'statement', 'ukip', 'action', 'children', 'good', 'tax', 'ms', 'price', 'number', 'former', 'ministers', 'earlier', 'help', 'come', 'debt', 'exchange', 'system', 'cut', 'going', 'hit', 'according', 'cost', 'commission', 'without', 'human', 'need', 'recent', 'india', 'higher', 'believe', 'euros', 'committee', 'quarter', 'industry', 'japan', 'kilroy-silk', 'january', 'profits', 'rose', 'share', 'local', '2003', 'policy', '10', 'announced', 'future', 'south', 'health', 'whether', 'despite', 'political', 'support', 'evidence', 'plan', 'investors', 'day', 'parliament']\n",
      "1\n",
      "['said', 'game', 'first', 'england', 'win', 'world', 'time', 'last', 'one', 'two', 'would', 'play', 'also', 'back', 'players', 'games', 'new', 'cup', 'year', 'get', 'good', 'team', 'could', 'wales', 'side', 'second', 'six', 'match', 'france', 'club', 'us', 'ireland', 'well', 'three', 'coach', 'set', 'injury', 'made', 'going', 'final', 'like', 'added', 'season', 'rugby', 'years', 'open', 'think', 'way', 'make', 'great', 'take', 'playing', 'got', 'victory', 'player', 'chelsea', 'league', 'go', 'played', 'nations', 'best', 'united', 'still', 'since', 'scotland', 'start', 'minutes', 'arsenal', 'next', 'number', 'told', 'champion', 'four', 'home', 'want', 'people', 'roddick', 'top', 'lot', 'week', 'come', 'williams', 'chance', 'beat', 'really', 'know', 'squad', 'manager', 'right', 'international', 'another', 'title', 'robinson', 'goal', 'football', 'five', 'saturday', 'bbc', 'end', 'even', 'real', 'try', 'third', 'came', 'ball', 'winning', 'former', 'much', 'olympic', 'put', 'left', 'see', 'grand', 'return', 'took', 'half', 'away', 'place', 'better', 'italy', 'champions', 'sport', 'liverpool', 'points', 'break', 'race', 'australian', 'many', 'big', 'lost', 'record', 'britain', 'round', 'ahead', 'european', 'sunday', 'zealand', 'training', 'andy', 'captain', 'work', 'despite', 'long', 'part', 'seed', 'boss', 'early', 'mark', 'bit', 'v', 'lead', '10', 'jones', 'premiership', 'says', 'far', 'run', 'fans', 'never', 'looking', 'defeat', 'things', 'form', 'spain', 'j', 'something', 'went', 'hard', 'manchester', 'slam', 'every', 'madrid', 'forward', 'give', 'days', 'past', 'need', 'french', 'face', 'always', 'point', 'tennis', 'event', 'matches', 'used', 'thing', 'little', 'fourth', 'pressure', 'nadal', 'behind', 'day', 'british', 'move', 'centre', 'american', 'admitted', 'city', 'look', 'country']\n",
      "2\n",
      "['said', 'film', 'best', 'music', 'year', 'us', 'also', 'one', 'new', 'awards', 'uk', 'show', 'last', 'number', 'first', 'award', 'top', 'years', 'band', 'two', 'people', 'actor', 'star', 'would', 'british', 'album', 'mr', 'tv', 'films', 'song', 'director', 'world', 'three', 'including', 'time', 'actress', 'bbc', 'million', 'chart', '000', 'singer', 'festival', 'like', 'made', 'record', 'prize', 'comedy', 'well', 'rock', 'five', 'make', 'series', 'office', 'movie', 'stars', 'sales', 'group', 'box', 'role', 'took', 'get', 'aviator', 'hit', 'industry', 'win', 'think', 'radio', 'week', 'life', 'nominations', 'four', 'oscar', 'london', 'could', '2004', 'place', 'many', 'pop', 'second', 'hollywood', 'added', 'play', 'set', 'list', 'good', '10', 'told', 'single', 'ceremony', 'really', 'theatre', 'include', 'success', 'love', 'since', 'man', 'book', 'says', 'take', 'fans', 'next', 'go', 'named', 'going', 'great', 'musical', 'sold', 'nominated', 'day', 'category', 'american', 'released', 'tour', 'children', 'went', 'big', 'academy', 'career', 'night', 'company', 'shows', 'drama', 'part', 'artists', 'played', 'starring', 'charles', 'urban', 'act', 'work', 'songs', 'may', 'original', 'black', 'audience', 'february', 'got', 'news', 'international', 'television', 'oscars', 'back', 'become', 'release', '2', 'end', 'much', 'way', 'see', 'live', 'came', 'young', 'former', 'martin', 'held', '25', 'stone', 'winner', 'already', 'debut', 'third', 'singles', 'christmas', 'included', 'stage', 'john', 'taking', 'around', 'money', 'winners', 'los', 'know', 'game', 'among', 'baby', 'ray', 'sony', 'however', 'favourite', 'later', '20', 'performance', 'hip-hop', 'come', 'want', 'screen', 'right', 'elvis', 'seen', 'public', 'making', 'producers', 'name', 'angels', 'supporting', 'michael', 'video', 'dollar', 'version', 'story']\n",
      "3\n",
      "['mr', 'said', 'labour', 'would', 'election', 'blair', 'party', 'brown', 'howard', 'government', 'tax', 'minister', 'people', 'prime', 'also', 'tory', 'chancellor', 'tories', 'plans', 'told', 'britain', 'leader', 'campaign', 'new', 'say', 'could', 'lib', 'public', 'tony', 'kennedy', 'next', 'michael', 'one', 'liberal', 'bbc', 'general', 'conservative', 'taxes', 'issue', 'uk', 'gordon', 'council', 'voters', 'secretary', 'spending', 'country', 'home', 'make', 'claims', 'going', 'get', 'local', 'immigration', 'dems', 'back', 'says', 'two', 'mps', 'time', 'economy', 'saying', 'cards', 'may', 'conservatives', 'spokesman', 'services', 'first', 'added', 'budget', 'milburn', 'year', 'democrats', 'way', 'id', 'trust', 'take', 'years', 'like', 'expected', 'iraq', 'asylum', 'last', 'vote', 'made', 'cuts', 'economic', 'good', 'part', 'parties', 'come', 'money', 'dem', 'us', 'cut', 'support', 'cabinet', 'asked', 'speech', 'think', 'see', 'role', '000', 'charles', 'news', 'whether', 'shadow', 'war', 'stand', 'programme', 'go', 'world', 'political', 'today', 'increase', 'mp', 'put', 'insisted', 'politics', 'bill', 'system', 'set', 'big', 'help', 'policy', 'win', 'plan', 'believe', 'policies', 'much', 'pay', 'give', 'week', 'claim', 'opposition', 'british', 'clear', 'conference', 'day', 'change', 'former', 'even', 'still', 'monday', 'right', 'lord', 'third', 'many', 'campbell', 'book', 'education', 'already', 'personal', 'poster', 'letwin', 'john', 'income', 'democrat', 'needed', 'manifesto', 'radio', 'decision', 'argued', 'poll', 'ahead', 'issues', 'number', 'proposals', 'want', 'debate', 'ministers', '5', 'commons', 'term', '4', 'nothing', 'campaigning', 'three', 'work', 'given', 'well', 'treasury', 'real', 'clarke', 'key', 'know', 'away', 'claimed', 'sir', 'members', 'choice', 'accused', 'failed', 'candidate', 'chairman', 'power', 'majority', 'got', 'seen', 'ever', 'called']\n",
      "4\n",
      "['said', 'people', 'mr', 'also', 'technology', 'would', 'new', 'mobile', 'one', 'could', 'users', 'us', 'net', 'digital', 'use', 'software', 'many', 'music', 'make', 'phone', 'microsoft', 'like', 'get', 'year', 'information', 'used', 'uk', 'firm', 'tv', 'service', 'broadband', 'data', 'online', 'computer', 'video', 'services', 'internet', 'way', 'company', 'apple', 'million', 'search', 'content', 'using', 'system', 'first', 'games', 'time', 'world', 'firms', 'market', 'number', 'says', 'phones', 'security', 'according', 'companies', 'web', 'last', 'networks', 'work', 'news', 'next', 'much', 'site', 'industry', 'access', 'made', 'pc', 'media', 'network', 'two', 'want', 'research', 'sites', 'help', 'players', 'devices', 'take', '000', 'consumers', 'see', 'yukos', 'around', 'e-mail', 'home', 'years', 'already', 'different', 'sony', 'able', 'going', 'set', 'added', 'bt', 'website', 'part', 'bbc', 'show', 'found', 'need', 'windows', 'go', 'still', 'months', 'well', 'may', 'systems', 'told', 'available', 'future', 'even', 'report', 'legal', 'customers', 'google', 'europe', 'pcs', 'three', 'technologies', 'websites', 'radio', 'files', 'images', 'version', 'group', 'end', 'messages', 'program', 'personal', 'court', 'consumer', 'gadget', 'become', 'good', 'control', 'mobiles', 'gadgets', 'currently', 'hard', 'via', 'business', 'means', 'spam', 'likely', 'move', 'computers', 'put', 'mac', '2004', 'machines', 'money', 'say', 'back', 'working', 'big', 'popular', 'find', 'high-definition', '10', 'called', 'latest', 'offer', 'programs', 'game', 'dvd', 'download', 'wireless', 'analysts', 'virus', 'expected', 'getting', 'chief', 'device', 'every', 'generation', 'making', 'several', 'play', 'although', 'ipod', 'products', 'lot', 'important', 'however', 'less', 'gaming', 'power', 'player', 'since', 'another', 'give', 'really', 'look', 'might', 'free', 'mini', 'operators', 'cost', 'sold']\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>label</th>\n",
       "      <th>label_text</th>\n",
       "      <th>text_tokenized</th>\n",
       "      <th>text_clean</th>\n",
       "      <th>cluster</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>wales want rugby league training wales could f...</td>\n",
       "      <td>2</td>\n",
       "      <td>sport</td>\n",
       "      <td>[wales, want, rugby, league, training, wales, ...</td>\n",
       "      <td>wales want rugby league training wales could f...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>china aviation seeks rescue deal scandal-hit j...</td>\n",
       "      <td>1</td>\n",
       "      <td>business</td>\n",
       "      <td>[china, aviation, seeks, rescue, deal, scandal...</td>\n",
       "      <td>china aviation seeks rescue deal scandal-hit j...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>rock band u2 break ticket record u2 have smash...</td>\n",
       "      <td>3</td>\n",
       "      <td>entertainment</td>\n",
       "      <td>[rock, band, u2, break, ticket, record, u2, sm...</td>\n",
       "      <td>rock band u2 break ticket record u2 smashed ir...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>markets signal brazilian recovery the brazilia...</td>\n",
       "      <td>1</td>\n",
       "      <td>business</td>\n",
       "      <td>[markets, signal, brazilian, recovery, brazili...</td>\n",
       "      <td>markets signal brazilian recovery brazilian st...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>tough rules for ringtone sellers firms that fl...</td>\n",
       "      <td>0</td>\n",
       "      <td>tech</td>\n",
       "      <td>[tough, rules, ringtone, sellers, firms, flout...</td>\n",
       "      <td>tough rules ringtone sellers firms flout rules...</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2217</th>\n",
       "      <td>soros group warns of kazakh close the open soc...</td>\n",
       "      <td>1</td>\n",
       "      <td>business</td>\n",
       "      <td>[soros, group, warns, kazakh, close, open, soc...</td>\n",
       "      <td>soros group warns kazakh close open society in...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2218</th>\n",
       "      <td>election  could be terror target  terrorists m...</td>\n",
       "      <td>4</td>\n",
       "      <td>politics</td>\n",
       "      <td>[election, could, terror, target, terrorists, ...</td>\n",
       "      <td>election could terror target terrorists might ...</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2219</th>\n",
       "      <td>lifestyle  governs mobile choice  faster  bett...</td>\n",
       "      <td>0</td>\n",
       "      <td>tech</td>\n",
       "      <td>[lifestyle, governs, mobile, choice, faster, b...</td>\n",
       "      <td>lifestyle governs mobile choice faster better ...</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2220</th>\n",
       "      <td>mobile multimedia slow to catch on there is no...</td>\n",
       "      <td>0</td>\n",
       "      <td>tech</td>\n",
       "      <td>[mobile, multimedia, slow, catch, doubt, mobil...</td>\n",
       "      <td>mobile multimedia slow catch doubt mobile phon...</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2221</th>\n",
       "      <td>owen determined to stay in madrid england forw...</td>\n",
       "      <td>2</td>\n",
       "      <td>sport</td>\n",
       "      <td>[owen, determined, stay, madrid, england, forw...</td>\n",
       "      <td>owen determined stay madrid england forward mi...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1780 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   text  label     label_text  \\\n",
       "0     wales want rugby league training wales could f...      2          sport   \n",
       "1     china aviation seeks rescue deal scandal-hit j...      1       business   \n",
       "2     rock band u2 break ticket record u2 have smash...      3  entertainment   \n",
       "3     markets signal brazilian recovery the brazilia...      1       business   \n",
       "4     tough rules for ringtone sellers firms that fl...      0           tech   \n",
       "...                                                 ...    ...            ...   \n",
       "2217  soros group warns of kazakh close the open soc...      1       business   \n",
       "2218  election  could be terror target  terrorists m...      4       politics   \n",
       "2219  lifestyle  governs mobile choice  faster  bett...      0           tech   \n",
       "2220  mobile multimedia slow to catch on there is no...      0           tech   \n",
       "2221  owen determined to stay in madrid england forw...      2          sport   \n",
       "\n",
       "                                         text_tokenized  \\\n",
       "0     [wales, want, rugby, league, training, wales, ...   \n",
       "1     [china, aviation, seeks, rescue, deal, scandal...   \n",
       "2     [rock, band, u2, break, ticket, record, u2, sm...   \n",
       "3     [markets, signal, brazilian, recovery, brazili...   \n",
       "4     [tough, rules, ringtone, sellers, firms, flout...   \n",
       "...                                                 ...   \n",
       "2217  [soros, group, warns, kazakh, close, open, soc...   \n",
       "2218  [election, could, terror, target, terrorists, ...   \n",
       "2219  [lifestyle, governs, mobile, choice, faster, b...   \n",
       "2220  [mobile, multimedia, slow, catch, doubt, mobil...   \n",
       "2221  [owen, determined, stay, madrid, england, forw...   \n",
       "\n",
       "                                             text_clean  cluster  \n",
       "0     wales want rugby league training wales could f...        1  \n",
       "1     china aviation seeks rescue deal scandal-hit j...        0  \n",
       "2     rock band u2 break ticket record u2 smashed ir...        2  \n",
       "3     markets signal brazilian recovery brazilian st...        0  \n",
       "4     tough rules ringtone sellers firms flout rules...        4  \n",
       "...                                                 ...      ...  \n",
       "2217  soros group warns kazakh close open society in...        0  \n",
       "2218  election could terror target terrorists might ...        3  \n",
       "2219  lifestyle governs mobile choice faster better ...        4  \n",
       "2220  mobile multimedia slow catch doubt mobile phon...        4  \n",
       "2221  owen determined stay madrid england forward mi...        1  \n",
       "\n",
       "[1780 rows x 6 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The helper is defined outside this cell; the 5 is presumably the number of clusters fitted in `km` -- TODO confirm against the helper's definition.\n",
    "# The call is the cell's last expression, so its return value (a DataFrame that includes a `cluster` column) is rendered as the cell output.\n",
    "print_most_common_words_by_cluster(train_df, km, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "21b4ff9d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "lib dems  new election pr chief the lib dems have appointed a senior figure from bt to be the party s new communications chief for their next general election effort.  sandy walkington will now work with senior figures such as matthew taylor on completing the party manifesto. party chief executive lord rennard said the appointment was a  significant strengthening of the lib dem team . mr walkington said he wanted the party to be ready for any  mischief  rivals or the media tried to throw at it.   my role will be to ensure this new public profile is effectively communicated at all levels   he said.  i also know the party will be put under scrutiny in the media and from the other parties as never before - and we will need to show ourselves ready and prepared to counter the mischief and misrepresentation that all too often comes from the party s opponents.  the party is already demonstrating on every issue that it is the effective opposition.  mr walkington s new job title is director of general election communications.\n",
      "[3]\n"
     ]
    }
   ],
   "source": [
    "# Take the second test document (positional row 1 of the `text` column) and show it.\n",
    "test_example = test_df['text'].iloc[1]\n",
    "print(test_example)\n",
    "\n",
    "# The single document is wrapped in a list before vectorizing; the resulting\n",
    "# `vectorized` matrix is then assigned to a cluster by the fitted model.\n",
    "vectorized = vec.transform([test_example])\n",
    "prediction = km.predict(vectorized)\n",
    "print(prediction)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "60ffa59f-5812-4eec-8928-f726a3b9d105",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[3]\n"
     ]
    }
   ],
   "source": [
    "# Persist the fitted KMeans model and read it back to confirm the round trip.\n",
    "# NOTE: only `km` is saved in this cell; the fitted vectorizer `vec` is not, so a\n",
    "# fresh session would also need it to build inputs for the reloaded model.\n",
    "# NOTE: joblib.load unpickles data and can execute arbitrary code -- load trusted files only.\n",
    "MODEL_PATH = '../data/kmeans.joblib'\n",
    "dump(km, MODEL_PATH)\n",
    "km_ = load(MODEL_PATH)\n",
    "\n",
    "# Recompute the reference with the in-memory model instead of trusting whatever\n",
    "# `prediction` currently holds, then check that the reloaded model agrees.\n",
    "expected_prediction = km.predict(vectorized)\n",
    "prediction = km_.predict(vectorized)\n",
    "assert (prediction == expected_prediction).all(), 'reloaded model disagrees with the in-memory model'\n",
    "print(prediction)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c203093-ec0c-4b77-afab-85c2cf796f4d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
